library(data.table) # Efficient Dataframe
library(lubridate) # For Dates
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree

# Load the three datasets used throughout the analysis.
# (The lost chunk boundary had fused the Cereals load into the comment
# above, so it never executed as written.)
Cereals <- fread("data/Cereals.csv")
eBay <- fread("data/eBayAuctions.csv")
UniversalBank <- fread("data/UniversalBank.csv")

# Preview of the Dataset
DT::datatable(head(Cereals, 2))

# Summary table for Cereals (reported output: Dimensions: 77 x 16)
dfSummary(Cereals,
          plain.ascii = FALSE,
          style = "grid",
          graph.magnif = 0.75,
          valid.col = FALSE,
          tmp.img.dir = "/tmp")
Duplicates: 0
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | name [character] |
1. 100%_Bran 2. 100%_Natural_Bran 3. All-Bran 4. All-Bran_with_Extra_Fiber 5. Almond_Delight 6. Apple_Cinnamon_Cheerios 7. Apple_Jacks 8. Basic_4 9. Bran_Chex 10. Bran_Flakes [ 67 others ] |
1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 1 ( 1.3%) 67 (87.0%) |
0 (0.0%) |
|
| 2 | mfr [character] |
1. A 2. G 3. K 4. N 5. P 6. Q 7. R |
1 ( 1.3%) 22 (28.6%) 23 (29.9%) 6 ( 7.8%) 9 (11.7%) 8 (10.4%) 8 (10.4%) |
0 (0.0%) |
|
| 3 | type [character] |
1. C 2. H |
74 (96.1%) 3 ( 3.9%) |
0 (0.0%) |
|
| 4 | calories [integer] |
Mean (sd) : 106.9 (19.5) min < med < max: 50 < 110 < 160 IQR (CV) : 10 (0.2) |
11 distinct values | 0 (0.0%) |
|
| 5 | protein [integer] |
Mean (sd) : 2.5 (1.1) min < med < max: 1 < 3 < 6 IQR (CV) : 1 (0.4) |
1 : 13 (16.9%) 2 : 25 (32.5%) 3 : 28 (36.4%) 4 : 8 (10.4%) 5 : 1 ( 1.3%) 6 : 2 ( 2.6%) |
0 (0.0%) |
|
| 6 | fat [integer] |
Mean (sd) : 1 (1) min < med < max: 0 < 1 < 5 IQR (CV) : 2 (1) |
0 : 27 (35.1%) 1 : 30 (39.0%) 2 : 14 (18.2%) 3 : 5 ( 6.5%) 5 : 1 ( 1.3%) |
0 (0.0%) |
|
| 7 | sodium [integer] |
Mean (sd) : 159.7 (83.8) min < med < max: 0 < 180 < 320 IQR (CV) : 80 (0.5) |
27 distinct values | 0 (0.0%) |
|
| 8 | fiber [numeric] |
Mean (sd) : 2.2 (2.4) min < med < max: 0 < 2 < 14 IQR (CV) : 2 (1.1) |
13 distinct values | 0 (0.0%) |
|
| 9 | carbo [numeric] |
Mean (sd) : 14.8 (3.9) min < med < max: 5 < 14.5 < 23 IQR (CV) : 5 (0.3) |
21 distinct values | 1 (1.3%) |
|
| 10 | sugars [integer] |
Mean (sd) : 7 (4.4) min < med < max: 0 < 7 < 15 IQR (CV) : 8 (0.6) |
16 distinct values | 1 (1.3%) |
|
| 11 | potass [integer] |
Mean (sd) : 98.7 (70.4) min < med < max: 15 < 90 < 330 IQR (CV) : 77.5 (0.7) |
35 distinct values | 2 (2.6%) |
|
| 12 | vitamins [integer] |
Mean (sd) : 28.2 (22.3) min < med < max: 0 < 25 < 100 IQR (CV) : 0 (0.8) |
0 : 8 (10.4%) 25 : 63 (81.8%) 100 : 6 ( 7.8%) |
0 (0.0%) |
|
| 13 | shelf [integer] |
Mean (sd) : 2.2 (0.8) min < med < max: 1 < 2 < 3 IQR (CV) : 2 (0.4) |
1 : 20 (26.0%) 2 : 21 (27.3%) 3 : 36 (46.8%) |
0 (0.0%) |
|
| 14 | weight [numeric] |
Mean (sd) : 1 (0.2) min < med < max: 0.5 < 1 < 1.5 IQR (CV) : 0 (0.1) |
0.50 : 2 ( 2.6%) 0.83 : 1 ( 1.3%) 1.00 : 64 (83.1%) 1.25 : 2 ( 2.6%) 1.30 : 1 ( 1.3%) 1.33 : 5 ( 6.5%) 1.50 : 2 ( 2.6%) |
0 (0.0%) |
|
| 15 | cups [numeric] |
Mean (sd) : 0.8 (0.2) min < med < max: 0.2 < 0.8 < 1.5 IQR (CV) : 0.3 (0.3) |
12 distinct values | 0 (0.0%) |
|
| 16 | rating [numeric] |
Mean (sd) : 42.7 (14) min < med < max: 18 < 40.4 < 93.7 IQR (CV) : 17.7 (0.3) |
77 distinct values | 0 (0.0%) |
# Missing Variables Plot for the Dataset
gg_miss_var(Cereals, show_pct = TRUE)
# NA % per variable
mean(is.na(Cereals$potass))
mean(is.na(Cereals$sugars))
mean(is.na(Cereals$carbo))
# NA % for the whole dataset
mean(is.na(Cereals))
# Cereals.csv has some missing values, such as Potass (2.6%), Sugars (1.3%)
# and Carbo (1.3%). The dataset has a proportion of 0.32% missing values.
# Preview of the Dataset
DT::datatable(head(eBay, 2))

# Summary table for eBay (reported output: Dimensions: 1972 x 8)
dfSummary(eBay,
          plain.ascii = FALSE,
          style = "grid",
          graph.magnif = 0.75,
          valid.col = FALSE,
          tmp.img.dir = "/tmp")
Duplicates: 346
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | Category [character] |
1. Music/Movie/Game 2. Collectibles 3. Toys/Hobbies 4. Automotive 5. Antique/Art/Craft 6. SportingGoods 7. Clothing/Accessories 8. Home/Garden 9. Jewelry 10. Health/Beauty [ 8 others ] |
403 (20.4%) 239 (12.1%) 234 (11.9%) 178 ( 9.0%) 177 ( 9.0%) 124 ( 6.3%) 119 ( 6.0%) 102 ( 5.2%) 82 ( 4.2%) 64 ( 3.2%) 250 (12.7%) |
0 (0.0%) |
|
| 2 | currency [character] |
1. EUR 2. GBP 3. US |
533 (27.0%) 147 ( 7.5%) 1292 (65.5%) |
0 (0.0%) |
|
| 3 | sellerRating [integer] |
Mean (sd) : 3560.2 (5973) min < med < max: 0 < 1853 < 37727 IQR (CV) : 2785 (1.7) |
461 distinct values | 0 (0.0%) |
|
| 4 | Duration [integer] |
Mean (sd) : 6.5 (2.1) min < med < max: 1 < 7 < 10 IQR (CV) : 2 (0.3) |
1 : 23 ( 1.2%) 3 : 213 (10.8%) 5 : 466 (23.6%) 7 : 967 (49.0%) 10 : 303 (15.4%) |
0 (0.0%) |
|
| 5 | endDay [character] |
1. Fri 2. Mon 3. Sat 4. Sun 5. Thu 6. Tue 7. Wed |
287 (14.6%) 548 (27.8%) 351 (17.8%) 338 (17.1%) 202 (10.2%) 171 ( 8.7%) 75 ( 3.8%) |
0 (0.0%) |
|
| 6 | ClosePrice [numeric] |
Mean (sd) : 36.4 (89.5) min < med < max: 0 < 10 < 999 IQR (CV) : 23.1 (2.5) |
827 distinct values | 0 (0.0%) |
|
| 7 | OpenPrice [numeric] |
Mean (sd) : 12.9 (38.9) min < med < max: 0 < 4.5 < 999 IQR (CV) : 8.8 (3) |
291 distinct values | 0 (0.0%) |
|
| 8 | Competitive? [integer] |
Min : 0 Mean : 0.5 Max : 1 |
0 : 906 (45.9%) 1 : 1066 (54.1%) |
0 (0.0%) |
# Missing Variables Plot for the Dataset
gg_miss_var(eBay, show_pct = TRUE)
# eBayAuction.csv has no missing value.
# Preview of the Dataset
# NOTE(review): this section's closing comment refers to UniversalBank.csv,
# but the original previewed/summarised eBay a second time -- an apparent
# copy-paste slip. Switched to UniversalBank.
DT::datatable(head(UniversalBank, 2))

dfSummary(UniversalBank,
          plain.ascii = FALSE,
          style = "grid",
          graph.magnif = 0.75,
          valid.col = FALSE,
          tmp.img.dir = "/tmp")
Duplicates: 346
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | Category [character] |
1. Music/Movie/Game 2. Collectibles 3. Toys/Hobbies 4. Automotive 5. Antique/Art/Craft 6. SportingGoods 7. Clothing/Accessories 8. Home/Garden 9. Jewelry 10. Health/Beauty [ 8 others ] |
403 (20.4%) 239 (12.1%) 234 (11.9%) 178 ( 9.0%) 177 ( 9.0%) 124 ( 6.3%) 119 ( 6.0%) 102 ( 5.2%) 82 ( 4.2%) 64 ( 3.2%) 250 (12.7%) |
0 (0.0%) |
|
| 2 | currency [character] |
1. EUR 2. GBP 3. US |
533 (27.0%) 147 ( 7.5%) 1292 (65.5%) |
0 (0.0%) |
|
| 3 | sellerRating [integer] |
Mean (sd) : 3560.2 (5973) min < med < max: 0 < 1853 < 37727 IQR (CV) : 2785 (1.7) |
461 distinct values | 0 (0.0%) |
|
| 4 | Duration [integer] |
Mean (sd) : 6.5 (2.1) min < med < max: 1 < 7 < 10 IQR (CV) : 2 (0.3) |
1 : 23 ( 1.2%) 3 : 213 (10.8%) 5 : 466 (23.6%) 7 : 967 (49.0%) 10 : 303 (15.4%) |
0 (0.0%) |
|
| 5 | endDay [character] |
1. Fri 2. Mon 3. Sat 4. Sun 5. Thu 6. Tue 7. Wed |
287 (14.6%) 548 (27.8%) 351 (17.8%) 338 (17.1%) 202 (10.2%) 171 ( 8.7%) 75 ( 3.8%) |
0 (0.0%) |
|
| 6 | ClosePrice [numeric] |
Mean (sd) : 36.4 (89.5) min < med < max: 0 < 10 < 999 IQR (CV) : 23.1 (2.5) |
827 distinct values | 0 (0.0%) |
|
| 7 | OpenPrice [numeric] |
Mean (sd) : 12.9 (38.9) min < med < max: 0 < 4.5 < 999 IQR (CV) : 8.8 (3) |
291 distinct values | 0 (0.0%) |
|
| 8 | Competitive? [integer] |
Min : 0 Mean : 0.5 Max : 1 |
0 : 906 (45.9%) 1 : 1066 (54.1%) |
0 (0.0%) |
# Missing Variables Plot for the Dataset
# NOTE(review): the original plotted eBay here although the conclusion below
# is about UniversalBank -- same copy-paste slip as the preview above.
gg_miss_var(UniversalBank, show_pct = TRUE)
# UniversalBank.csv has no missing value.
# Loading UniversalBank.csv
UniversalBank <- fread("data/UniversalBank.csv")
# Removing ZIP Code and ID (identifiers, not predictors)
UniversalBank <- UniversalBank[, -c("ZIP Code", "ID")]
# Setting Seed for a reproducible split
set.seed(1)
# Training and Validation Proportion
Training_Proportion <- 0.6
Validation_Proportion <- 1 - Training_Proportion
# Splitting: TRUE rows go to training, FALSE to validation
# (renamed from `sample` to avoid shadowing base::sample)
train_rows <- sample(c(TRUE, FALSE), nrow(UniversalBank), replace = TRUE,
                     prob = c(Training_Proportion, Validation_Proportion))
UB_Training <- UniversalBank[train_rows, ]
UB_Validation <- UniversalBank[!train_rows, ]
# Checking Proportions actually achieved by the random split
Training_Proportion_Check <- nrow(UB_Training) / nrow(UniversalBank)
Validation_Proportion_Check <- nrow(UB_Validation) / nrow(UniversalBank)
# Printing Result Check (rounded, consistent with the later eBay split)
print(paste("Proportion in Training is", round(Training_Proportion_Check * 100), "%",
            "and in Validation is", round(Validation_Proportion_Check * 100), "%"))
# Reported output: "Proportion in Training is 61 % and in Validation is 39 %"
Use Personal Loan as the outcome variable. Report the validation confusion matrix for each of the three models.
# Set Seed
set.seed(1)
# Work on copies so the shared UB_* splits stay untouched for other models
LR_Training <- UB_Training
LR_Validation <- UB_Validation
# Target Variable As Factor (0 -> "No Loan", 1 -> "Loan")
LR_Training$`Personal Loan` <- factor(LR_Training$`Personal Loan`, levels = c(0, 1), labels = c("No Loan", "Loan"))
LR_Validation$`Personal Loan` <- factor(LR_Validation$`Personal Loan`, levels = c(0, 1), labels = c("No Loan", "Loan"))
# Education as a labelled factor so glm builds the two dummy contrasts
LR_Training$Education <- factor(LR_Training$Education, levels = c(1, 2, 3), labels = c("Undergrad", "Graduate", "Advanced/Professional"))
LR_Validation$Education <- factor(LR_Validation$Education, levels = c(1, 2, 3), labels = c("Undergrad", "Graduate", "Advanced/Professional"))
# Run logistic regression on all remaining predictors
logit.reg <- glm(`Personal Loan` ~ ., data = LR_Training, family = "binomial")
# Disable scientific notation in the printed coefficient table
options(scipen = 999)
summary(logit.reg)
FALSE Call:
FALSE glm(formula = `Personal Loan` ~ ., family = "binomial", data = LR_Training)
FALSE
FALSE Deviance Residuals:
FALSE Min 1Q Median 3Q Max
FALSE -2.1853 -0.1818 -0.0672 -0.0212 4.1974
FALSE
FALSE Coefficients:
FALSE Estimate Std. Error z value
FALSE (Intercept) -10.3177782 2.5770778 -4.004
FALSE Age -0.1295287 0.0971009 -1.334
FALSE Experience 0.1462520 0.0961873 1.520
FALSE Income 0.0589502 0.0038257 15.409
FALSE Family 0.6028245 0.1013175 5.950
FALSE CCAvg 0.2018200 0.0586504 3.441
FALSE EducationGraduate 3.9264743 0.3520393 11.154
FALSE EducationAdvanced/Professional 4.0665622 0.3382094 12.024
FALSE Mortgage 0.0012640 0.0007709 1.640
FALSE `Securities Account` -0.5440472 0.3724749 -1.461
FALSE `CD Account` 3.7374014 0.4324751 8.642
FALSE Online -0.5841282 0.2188906 -2.669
FALSE CreditCard -0.8865331 0.2775008 -3.195
FALSE Pr(>|z|)
FALSE (Intercept) 0.00006236641 ***
FALSE Age 0.182217
FALSE Experience 0.128387
FALSE Income < 0.0000000000000002 ***
FALSE Family 0.00000000268 ***
FALSE CCAvg 0.000579 ***
FALSE EducationGraduate < 0.0000000000000002 ***
FALSE EducationAdvanced/Professional < 0.0000000000000002 ***
FALSE Mortgage 0.101073
FALSE `Securities Account` 0.144118
FALSE `CD Account` < 0.0000000000000002 ***
FALSE Online 0.007617 **
FALSE CreditCard 0.001400 **
FALSE ---
FALSE Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE
FALSE (Dispersion parameter for binomial family taken to be 1)
FALSE
FALSE Null deviance: 1947.62 on 3049 degrees of freedom
FALSE Residual deviance: 686.36 on 3037 degrees of freedom
FALSE AIC: 712.36
FALSE
FALSE Number of Fisher Scoring iterations: 8
# Set Seed
set.seed(1)
# Predicted P(Loan) on the validation set.
# Fix: the original dropped a column literally named "`Personal Loan`"
# (backticks inside the string), which does not exist; the real column
# name has no backticks.
logit.reg.pred <- predict(logit.reg, LR_Validation[, -c("Personal Loan")], type = "response")
# Classify with an explicit 0.5 cutoff. round() uses round-half-to-even,
# so an exact 0.5 would have been classified as 0; `> 0.5` makes the
# cutoff explicit.
logit.reg.pred_round <- as.numeric(logit.reg.pred > 0.5)
# Keep probabilities next to classes for a sanity check
dataframe_predictions <- cbind(logit.reg.pred, logit.reg.pred_round)
# Predictions as Factor, matching the outcome's level order
logit.reg.pred_round <- factor(logit.reg.pred_round, levels = c(0, 1), labels = c("No Loan", "Loan"))
# Confusion Matrix on the validation set
Confusion_Matrix_LR <- confusionMatrix(data = logit.reg.pred_round, reference = LR_Validation$`Personal Loan`)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top, per-class statistics (Sensitivity,
# Specificity, Precision, Recall, F1) and overall Accuracy/Kappa below.
draw_confusion_matrix_LR <- function(cm) {
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
  title('CONFUSION MATRIX for Logistic Regression', cex.main = 2)
  # quadrant backgrounds and axis labels (rows = Predicted, cols = Actual)
  rect(150, 430, 240, 370, col = '#1c6155')
  text(195, 435, 'No Loan', cex = 1.2)
  rect(250, 430, 340, 370, col = '#1c615570')
  text(295, 435, 'Loan', cex = 1.2)
  text(125, 370, 'Predicted', cex = 1.3, srt = 90, font = 2)
  text(245, 450, 'Actual', cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = '#1c615570')
  rect(250, 305, 340, 365, col = '#1c6155')
  text(140, 400, 'No Loan', cex = 1.2, srt = 90)
  text(140, 335, 'Loan', cex = 1.2, srt = 90)
  # counts: as.numeric() flattens the 2x2 table column-major, matching the
  # four cell positions below (pred No/act No, pred Loan/act No, ...)
  res <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (i in seq_len(4)) {
    text(cell_x[i], cell_y[i], res[i], cex = 1.6, font = 2, col = 'white')
  }
  # details panel: byClass indices 1,2,5,6,7 = Sensitivity, Specificity,
  # Precision, Recall, F1
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "", main = "DETAILS", xaxt = 'n', yaxt = 'n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex = 1.2, font = 2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex = 1.2)
  }
  # overall Accuracy and Kappa
  overall_x <- c(30, 70)
  for (k in seq_len(2)) {
    text(overall_x[k], 35, names(cm$overall[k]), cex = 1.5, font = 2)
    text(overall_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex = 1.4)
  }
}
# Plot the Confusion Matrix
draw_confusion_matrix_LR(Confusion_Matrix_LR)
# Comments:
# Set Seed
set.seed(1)
# Target Variable As Factor (0 -> "No Loan", 1 -> "Loan")
UB_Training$`Personal Loan` <- factor(UB_Training$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan"))
UB_Validation$`Personal Loan` <- factor(UB_Validation$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan"))
# Education as Factor so dummyVars can one-hot encode it
UB_Training$Education <- factor(UB_Training$Education)
UB_Validation$Education <- factor(UB_Validation$Education)
# Library Caret
library(caret)
# One-hot encode Education via caret::dummyVars
dummy <- dummyVars(" ~ Education", data=UB_Training)
Education_Dummy <- data.frame(predict(dummy, newdata = UB_Training))
# NOTE(review): dummy_2 is fitted but never used -- the validation dummies
# below are produced with `dummy` (fitted on training), which is the safer
# choice anyway; dummy_2 could be removed.
dummy_2 <- dummyVars(" ~ Education", data=UB_Validation)
Education_Dummy_2 <- data.frame(predict(dummy, newdata = UB_Validation))
# Replace the Education factor with its one-hot columns
UB_Training <- UB_Training[,-c("Education")]
UB_Training <- cbind(UB_Training,Education_Dummy)
UB_Validation <- UB_Validation[,-c("Education")]
UB_Validation <- cbind(UB_Validation,Education_Dummy_2)
# Duplicate Dataset for Preprocess
UB_Training_Preprocess <- UB_Training
UB_Validation_Preprocess <- UB_Validation
# Drop the outcome and the binary/dummy columns so only the continuous
# predictors define the centering/scaling parameters
UB_Training_Preprocess <- UB_Training_Preprocess[,-c("Personal Loan","Online","CreditCard","Securities Account","CD Account","Education.1","Education.2","Education.3")]
# Fit center/scale on training only, then apply to both full sets
# (predict() transforms only the columns captured in norm_values)
norm_values <- preProcess(UB_Training_Preprocess, method = c("center","scale"))
UB_Training_Preprocess <- predict(norm_values,UB_Training)
UB_Validation_Preprocess <- predict(norm_values, UB_Validation)
# Set Seed
set.seed(1)
# KNN Model using class package
library(class)
# Choosing our K value
k <- 3
# KNN: predicted class per validation row; with prob = TRUE the vote share
# of the WINNING class is attached as the "prob" attribute
Predictions_KNN_Training <- knn(train=UB_Training_Preprocess[,-c("Personal Loan")], test = UB_Validation_Preprocess[,-c("Personal Loan")], cl = UB_Training_Preprocess$`Personal Loan`, k=k, prob = TRUE)
# Convert the winning-class vote share to P(Loan): keep it when the
# prediction is "Loan", otherwise take its complement.
# Fix: the original only replaced probabilities equal to 1 with 0, which
# mislabelled every non-unanimous "No Loan" vote share as P(Loan).
winning_share <- attributes(Predictions_KNN_Training)$prob
Predictions_KNN_Training_Proba <- ifelse(Predictions_KNN_Training == "Loan",
                                         winning_share, 1 - winning_share)
# Keep the same data.frame shape downstream code indexes with [, 1]
Predictions_KNN_Training_Proba <- as.data.frame(Predictions_KNN_Training_Proba)
# Confusion Matrix on the validation set
Confusion_Matrix_KNN <- confusionMatrix(data = Predictions_KNN_Training, reference = UB_Validation$`Personal Loan`)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top, per-class statistics and overall
# Accuracy/Kappa below.
# NOTE(review): near-duplicate of draw_confusion_matrix_LR -- only the title
# differs; a single helper taking the title as a parameter would remove the
# repetition.
draw_confusion_matrix_KNN <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX for KNN with k=3', cex.main=2)
# create the matrix (rows = Predicted, columns = Actual)
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'No Loan', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Loan', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'No Loan', cex=1.2, srt=90)
text(140, 335, 'Loan', cex=1.2, srt=90)
# add in the cm results -- as.numeric() flattens the 2x2 table column-major:
# pred No/act No, pred Loan/act No, pred No/act Loan, pred Loan/act Loan
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics: byClass 1,2,5,6,7 = Sensitivity, Specificity,
# Precision, Recall, F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information (overall Accuracy and Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
# Plot the Confusion Matrix
draw_confusion_matrix_KNN(Confusion_Matrix_KNN)
# Comments:
# Set Seed
set.seed(1)
# Tree Packages
library(rpart)
library(rpart.plot)
# Running Tree. Name the outcome as a column of `data`; the original used
# the fragile `UB_Training$... ~ .` form, which relies on the formula
# machinery recognising the column name inside the $-expression.
CT <- rpart(`Personal Loan` ~ ., data = UB_Training)
# Plotting Tree: leaves shaded, internal nodes white
CT_Plot <- rpart.plot(CT, type = 0, varlen = 0,
                      box.col = ifelse(CT$frame$var == "<leaf>", '#8db0aa', 'white'),
                      fallen.leaves = FALSE, extra = FALSE,
                      main = "Classification Trees", cex.main = 1.5)
# Set Seed
set.seed(1)
# Predictions on the validation set: class labels and class probabilities
CT_Predictions <- predict(CT, UB_Validation, type = 'class')
CT_Predictions_Proba <- predict(CT, UB_Validation, type = 'prob')
# Confusion Matrix on the validation set
Confusion_Matrix_CT <- confusionMatrix(data = CT_Predictions, reference = UB_Validation$`Personal Loan`)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top, per-class statistics and overall
# Accuracy/Kappa below.
# NOTE(review): near-duplicate of draw_confusion_matrix_LR -- only the title
# differs; consider one parameterized helper.
draw_confusion_matrix_CT <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX for Classification Trees', cex.main=2)
# create the matrix (rows = Predicted, columns = Actual)
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'No Loan', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Loan', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'No Loan', cex=1.2, srt=90)
text(140, 335, 'Loan', cex=1.2, srt=90)
# add in the cm results (table flattened column-major)
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics: byClass 1,2,5,6,7 = Sensitivity, Specificity,
# Precision, Recall, F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information (overall Accuracy and Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
# Plot the Confusion Matrix
draw_confusion_matrix_CT(Confusion_Matrix_CT)
# Comments:
# Collect the validation accuracies of the three individual models
Accuracy_CT <- Confusion_Matrix_CT$overall[1]
Accuracy_KNN <- Confusion_Matrix_KNN$overall[1]
Accuracy_LR <- Confusion_Matrix_LR$overall[1]
# data.frame() already returns a data frame; the original re-coerced it
# with a redundant as.data.frame() call
Accuracy_DF <- data.frame(Accuracy_CT, Accuracy_KNN, Accuracy_LR)
# Plotting Accuracy
data.table(Accuracy_DF)
FALSE 1: 0.9815385 0.9692308 0.9487179
Comments: The most accurate model is the Classification Tree, with about 98% of validation cases predicted correctly; the least accurate model is the Logistic Regression, at roughly 95% accuracy.
# Creating Actual VERSUS Predicted Dataframe for the three models
LR_Dataframe <- data.frame(logit.reg.pred_round)
KNN_Dataframe <- data.frame(Predictions_KNN_Training)
CT_Dataframe <- data.frame(CT_Predictions)
All_Models_Predictions <- cbind(LR_Dataframe, KNN_Dataframe, CT_Dataframe, UB_Validation$`Personal Loan`)
# Changing Columns Name
colnames(All_Models_Predictions) <- c("Predicted - LR", "Predicted - KNN", "Predicted - CT", "Actual")
# Showing Dataframes
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation - 3 Models")
# Majority vote: a case is "Loan" when at least 2 of the 3 models say so.
# Vectorized: each factor has levels ("No Loan","Loan"), so as.integer()-1
# is a 0/1 Loan indicator; the original row-by-row loop with
# factor -> numeric -> factor round-trips produced the same result.
votes <- (as.integer(All_Models_Predictions$`Predicted - LR`) - 1L) +
  (as.integer(All_Models_Predictions$`Predicted - KNN`) - 1L) +
  (as.integer(All_Models_Predictions$`Predicted - CT`) - 1L)
All_Models_Predictions$Majority <- factor(as.integer(votes >= 2L),
                                          levels = c(0, 1),
                                          labels = c("No Loan", "Loan"))
# Show Majority Column
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Majority Vote Added) - 3 Models")
# Confusion Matrix for the majority-vote ensemble
Confusion_Matrix_Majority <- confusionMatrix(data = All_Models_Predictions$Majority, reference = All_Models_Predictions$Actual)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top, per-class statistics and overall
# Accuracy/Kappa below.
# NOTE(review): near-duplicate of draw_confusion_matrix_LR -- only the title
# differs; consider one parameterized helper.
draw_confusion_matrix_Majority <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX for Majority of Votes', cex.main=2)
# create the matrix (rows = Predicted, columns = Actual)
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'No Loan', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Loan', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'No Loan', cex=1.2, srt=90)
text(140, 335, 'Loan', cex=1.2, srt=90)
# add in the cm results (table flattened column-major)
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics: byClass 1,2,5,6,7 = Sensitivity, Specificity,
# Precision, Recall, F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information (overall Accuracy and Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
# Plot the Confusion Matrix
draw_confusion_matrix_Majority(Confusion_Matrix_Majority)
# Comments: The Majority Vote has an accuracy of 97.2%, which is slightly
# better than LR and KNN but less accurate than CT.
# Average-of-probabilities ensemble: mean of the three models' P(Loan).
# Vectorized; the original filled the column in a row-by-row loop with the
# same arithmetic.
# NOTE(review): Predictions_KNN_Training_Proba[, 1] is derived from knn()'s
# winning-class vote share -- verify upstream that it really holds P(Loan).
All_Models_Predictions$Average <- (CT_Predictions_Proba[, 2] +
                                     logit.reg.pred +
                                     Predictions_KNN_Training_Proba[, 1]) / 3
# Round Average Vote to 4 decimals for display
All_Models_Predictions$Average <- round(All_Models_Predictions$Average, 4)
# Show Average Column with Probabilities
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Probabilities Added) - 3 Models")
# Change Probabilities to 0 or 1 with cutoff = 0.5 (vectorized)
All_Models_Predictions$Average <- as.integer(All_Models_Predictions$Average > 0.5)
# As Factor Average
All_Models_Predictions$Average <- factor(All_Models_Predictions$Average, levels = c(0, 1), labels = c("No Loan", "Loan"))
# Show Average Column
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Added) - 3 Models")
# Confusion Matrix for the averaged-probability ensemble
Confusion_Matrix_Average <- confusionMatrix(data = All_Models_Predictions$Average, reference = All_Models_Predictions$Actual)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top, per-class statistics and overall
# Accuracy/Kappa below.
# NOTE(review): near-duplicate of draw_confusion_matrix_LR -- only the title
# differs; consider one parameterized helper.
draw_confusion_matrix_Average <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX for Average Predicted Probabilities', cex.main=2)
# create the matrix (rows = Predicted, columns = Actual)
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'No Loan', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Loan', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'No Loan', cex=1.2, srt=90)
text(140, 335, 'Loan', cex=1.2, srt=90)
# add in the cm results (table flattened column-major)
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics: byClass 1,2,5,6,7 = Sensitivity, Specificity,
# Precision, Recall, F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information (overall Accuracy and Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
# Plot the Confusion Matrix
draw_confusion_matrix_Average(Confusion_Matrix_Average)
# Comments:
# Computing Error Rates (1 - accuracy)
# Individual Methods
Error_CT <- 1 - Accuracy_CT
Error_KNN <- 1 - Accuracy_KNN
Error_LR <- 1 - Accuracy_LR
# Ensemble Methods
Error_Majority <- 1 - Confusion_Matrix_Majority$overall[1]
Error_Average <- 1 - Confusion_Matrix_Average$overall[1]
# Combine into a one-row table
Error_Rates_DF <- cbind(Error_LR, Error_KNN, Error_CT, Error_Majority, Error_Average)
# Rounding to 3 decimals (the original comment said 5, but the code rounds to 3)
Error_Rates_DF <- round(Error_Rates_DF, 3)
DT::datatable(Error_Rates_DF, caption = "Error Rates from 3 Individual Methods and 2 Ensembles Methods")
# Comments:
# Loading eBay Auctions
eBayAuction_DF <- fread("data/eBayAuctions.csv")
# Setting Seed for a reproducible split
set.seed(1)
# Training and Validation Proportion
Training_Proportion <- 0.6
Validation_Proportion <- 1 - Training_Proportion
# Splitting: TRUE rows go to training, FALSE to validation
# (renamed from `sample` to avoid shadowing base::sample)
train_rows <- sample(c(TRUE, FALSE), nrow(eBayAuction_DF), replace = TRUE,
                     prob = c(Training_Proportion, Validation_Proportion))
EB_Training <- eBayAuction_DF[train_rows, ]
EB_Validation <- eBayAuction_DF[!train_rows, ]
# Checking Proportions actually achieved by the random split
Training_Proportion_Check <- nrow(EB_Training) / nrow(eBayAuction_DF)
Validation_Proportion_Check <- nrow(EB_Validation) / nrow(eBayAuction_DF)
# Printing Result Check
print(paste("Proportion in Training is", round(Training_Proportion_Check * 100), "%",
            "and in Validation is", round(Validation_Proportion_Check * 100), "%"))
# Reported output: "Proportion in Training is 61 % and in Validation is 39 %"
# Set Seed
set.seed(1)
# Tree Packages
library(rpart)
library(rpart.plot)
# Keep untouched numeric copies of the splits for the gains/lift chart later
EB_Training_Pure <- EB_Training
EB_Validation_Pure <- EB_Validation
# Factor for outcome variable (0 -> "No", 1 -> "Yes")
EB_Training$`Competitive?` <- factor(EB_Training$`Competitive?`, levels = c(0, 1), labels = c("No", "Yes"))
EB_Validation$`Competitive?` <- factor(EB_Validation$`Competitive?`, levels = c(0, 1), labels = c("No", "Yes"))
# Running Classification Tree. Name the outcome as a column of `data`;
# the original used the fragile `EB_Training$... ~ .` formula form.
CT <- rpart(`Competitive?` ~ ., data = EB_Training)
# Predict for Validation Set with CT: class labels and class probabilities
CT_Predictions <- predict(CT, EB_Validation, type = 'class')
CT_Predictions_Prob <- predict(CT, EB_Validation, type = 'prob')
# Check Actual VS Predicted side by side
Predictions_Actual_CT <- data.frame(EB_Validation$`Competitive?`, CT_Predictions)
colnames(Predictions_Actual_CT) <- c("Actual", "Predicted")
# Render Table
DT::datatable(Predictions_Actual_CT, caption = "Actual VS Predicted with Default Classification Trees")
# Set seed
set.seed(1)
# Confusion Matrix on the validation set
Confusion_Matrix_CT_2 <- confusionMatrix(data = CT_Predictions, reference = EB_Validation$`Competitive?`)
# Create the Function for Confusion Matrix.
# Draws `cm` (a caret::confusionMatrix object) as a two-panel base-graphics
# figure: the 2x2 count matrix on top (No/Yes classes for the eBay task),
# per-class statistics and overall Accuracy/Kappa below.
# NOTE(review): near-duplicate of draw_confusion_matrix_LR with different
# title and class labels; consider one parameterized helper.
draw_confusion_matrix_CT_2 <- function(cm) {
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX for Default Classification Trees', cex.main=2)
# create the matrix (rows = Predicted, columns = Actual)
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'No', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Yes', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'No', cex=1.2, srt=90)
text(140, 335, 'Yes', cex=1.2, srt=90)
# add in the cm results (table flattened column-major)
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics: byClass 1,2,5,6,7 = Sensitivity, Specificity,
# Precision, Recall, F1
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information (overall Accuracy and Kappa)
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
# Plot the Confusion Matrix
draw_confusion_matrix_CT_2(Confusion_Matrix_CT_2)
# Comments: The Overall Accuracy is 0.81 for this Default Classification Tree.
# Set seed
set.seed(1)
# Load Gains
library(gains)
# Decile-wise lift: mean response per decile divided by the overall mean
# response (uses the numeric 0/1 outcome kept in EB_Validation_Pure)
gain <- gains(EB_Validation_Pure$`Competitive?`, CT_Predictions_Prob[, 2])
barplot(gain$mean.resp / mean(EB_Validation_Pure$`Competitive?`), names.arg = gain$depth, xlab = "Percentile",
        ylab = "Mean Response", main = "Decile-wise lift chart", col = "#1c6155")
# Comments:
# Set seed
set.seed(1)
library(adabag)
# Boosting left disabled in the original (presumably slow to run).
# NOTE(review): adabag::boosting requires a factor outcome; EB_Training_Pure
# keeps `Competitive?` numeric -- use EB_Training instead if enabling.
#Model_Boost <- boosting(`Competitive?` ~ . , data = EB_Training_Pure)
#Prediction_Boost <- predict(Model_Boost, EB_Validation)
# Comments: The Overall Accuracy is ...
Comments:
# Set seed so the bootstrap samples drawn by bagging() are reproducible
set.seed(1)
# Load Package
library(adabag)
# Predictions
# Fit a bagged classification tree on the training set.
# NOTE(review): assumes `Competitive?` in EB_Training is a factor -- confirm.
bag <- bagging(`Competitive?` ~ ., data = EB_Training)
Prediction_Bagging <- predict(bag, EB_Validation, type="class")
# predict() on a bagging object returns a list; $class holds the predicted
# labels as character, so coerce to factor for confusionMatrix()
Prediction_Bagging_Factor <- as.factor(Prediction_Bagging$class)
# Confusion Matrix
Confusion_Matrix_BT <- confusionMatrix(Prediction_Bagging_Factor, EB_Validation$`Competitive?`)
# Create the Function for Confusion Matrix
draw_confusion_matrix_BT <- function(cm) {
  # Render a caret::confusionMatrix-style object as a two-panel figure:
  # top panel shows the 2x2 matrix, bottom panel the summary statistics.
  # `cm` must provide $table (2x2), $byClass (named stats), $overall (named stats).
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # --- top panel: the 2x2 confusion matrix ---------------------------------
  plot(c(100, 345), c(300, 450), type = "n",
       xlab = "", ylab = "", xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Bagged Tree", cex.main = 2)
  # colored cells and axis labels
  rect(150, 430, 240, 370, col = "#1c6155")
  text(195, 435, "No", cex = 1.2)
  rect(250, 430, 340, 370, col = "#1c615570")
  text(295, 435, "Yes", cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = "#1c615570")
  rect(250, 305, 340, 365, col = "#1c6155")
  text(140, 400, "No", cex = 1.2, srt = 90)
  text(140, 335, "Yes", cex = 1.2, srt = 90)
  # cell counts; as.numeric() flattens the table column-major, matching the
  # (x, y) positions below
  res <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_len(4)) {
    text(cell_x[k], cell_y[k], res[k], cex = 1.6, font = 2, col = "white")
  }

  # --- bottom panel: per-class statistics and overall accuracy -------------
  plot(c(100, 0), c(100, 0), type = "n",
       xlab = "", ylab = "", main = "DETAILS", xaxt = "n", yaxt = "n")
  # byClass entries 1, 2, 5, 6, 7: Sensitivity, Specificity, Precision,
  # Recall, F1 (names taken from the object itself)
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex = 1.2, font = 2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex = 1.2)
  }
  # overall entries 1 and 2: Accuracy and Kappa
  acc_x <- c(30, 70)
  for (k in seq_len(2)) {
    text(acc_x[k], 35, names(cm$overall[k]), cex = 1.5, font = 2)
    text(acc_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex = 1.4)
  }
}
# Plot the Confusion Matrix for the bagged tree
draw_confusion_matrix_BT(Confusion_Matrix_BT)
# Comments: The Overall Accuracy is 0.853 for this Bagged Tree.
# (Fix: the sentence above was fused onto the code line by the Rmd extraction,
# which made the line invalid R; it is now a proper comment.)
# Set seed for reproducibility
set.seed(1)
# Load Gains
library(gains)
# Decile-wise lift chart for the bagged-tree class probabilities
# ($prob[, 2] is the probability of the positive class).
gain_2 <- gains(EB_Validation_Pure$`Competitive?`, Prediction_Bagging$prob[, 2])
barplot(gain_2$mean.resp / mean(EB_Validation_Pure$`Competitive?`),
        names.arg = gain_2$depth, xlab = "Percentile",
        ylab = "Mean Response", main = "Decile-wise lift chart", col = "#1c6155")
# Comments:
# (Fix: the trailing "Comments:" text was fused onto the barplot line by the
# Rmd extraction; it is now a proper comment.)
# Set seed so the random forest's bootstrap/feature sampling is reproducible
set.seed(1)
# Load Package
library(randomForest)
# Predictions
# BUG FIX: the original formula was `EB_Training$`Competitive?` ~ .`. Because
# the response was specified as an external vector rather than a column name,
# the `.` on the right-hand side expanded to ALL columns of `data` -- including
# `Competitive?` itself -- so the model trained on its own outcome (target
# leakage). Referencing the response by name excludes it from `.` as intended.
Random_Forest <- randomForest(`Competitive?` ~ ., data = EB_Training, mtry = 4)
Prediction_RF <- predict(Random_Forest, EB_Validation, type = "class")
Prediction_RF_Prob <- predict(Random_Forest, EB_Validation, type = "prob")
# Confusion Matrix
Confusion_Matrix_RF <- confusionMatrix(data = Prediction_RF, reference = EB_Validation$`Competitive?`)
# Create the Function for Confusion Matrix
draw_confusion_matrix_RF <- function(cm) {
  # Two-panel base-graphics rendering of a caret::confusionMatrix-style object:
  # the upper panel draws the 2x2 matrix, the lower one the key statistics.
  # `cm` must provide $table (2x2), $byClass (named stats), $overall (named stats).
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))

  # Upper panel: empty canvas, then the shaded matrix cells and labels.
  plot(c(100, 345), c(300, 450), type = "n",
       xlab = "", ylab = "", xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX for Random Forest", cex.main = 2)
  rect(150, 430, 240, 370, col = "#1c6155")
  text(195, 435, "No", cex = 1.2)
  rect(250, 430, 340, 370, col = "#1c615570")
  text(295, 435, "Yes", cex = 1.2)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = "#1c615570")
  rect(250, 305, 340, 365, col = "#1c6155")
  text(140, 400, "No", cex = 1.2, srt = 90)
  text(140, 335, "Yes", cex = 1.2, srt = 90)

  # Counts flattened column-major by as.numeric(); place them cell by cell.
  counts <- as.numeric(cm$table)
  positions <- list(c(195, 400), c(195, 335), c(295, 400), c(295, 335))
  for (k in seq_along(positions)) {
    text(positions[[k]][1], positions[[k]][2], counts[k],
         cex = 1.6, font = 2, col = "white")
  }

  # Lower panel: Sensitivity, Specificity, Precision, Recall, F1
  # (byClass indices 1, 2, 5, 6, 7) plus Accuracy and Kappa from $overall.
  plot(c(100, 0), c(100, 0), type = "n",
       xlab = "", ylab = "", main = "DETAILS", xaxt = "n", yaxt = "n")
  detail_idx <- c(1, 2, 5, 6, 7)
  detail_x <- c(10, 30, 50, 70, 90)
  for (k in seq_along(detail_idx)) {
    stat <- cm$byClass[detail_idx[k]]
    text(detail_x[k], 85, names(stat), cex = 1.2, font = 2)
    text(detail_x[k], 70, round(as.numeric(stat), 3), cex = 1.2)
  }
  overall_x <- c(30, 70)
  for (k in seq_along(overall_x)) {
    text(overall_x[k], 35, names(cm$overall[k]), cex = 1.5, font = 2)
    text(overall_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex = 1.4)
  }
}
# Plot the Confusion Matrix for the random forest
draw_confusion_matrix_RF(Confusion_Matrix_RF)
# Comments: The Overall Accuracy is 0.877 for this Random Forest.
# (Fix: the sentence above was fused onto the code line by the Rmd extraction,
# which made the line invalid R; it is now a proper comment.)
# Load Gains
library(gains)
# Decile-wise lift chart for the random-forest class probabilities
# (column 2 of the "prob" prediction is the positive class).
gain_3 <- gains(EB_Validation_Pure$`Competitive?`, Prediction_RF_Prob[, 2])
barplot(gain_3$mean.resp / mean(EB_Validation_Pure$`Competitive?`),
        names.arg = gain_3$depth, xlab = "Percentile",
        ylab = "Mean Response", main = "Decile-wise lift chart", col = "#1c6155")
# Comments:
# (Fix: the trailing "Comments:" text was fused onto the barplot line by the
# Rmd extraction; it is now a proper comment.)
The bagged tree and the random forest achieve similar overall accuracy (0.853 vs 0.877), and the first decile of their decile-wise lift charts is also very close.
Task: remove all cereals with missing values from the dataset.
# Import Dataset Cereals.csv
Cereals <- fread("data/Cereals.csv")
# Row count before dropping incomplete rows
# (nrow() already returns a number; the original as.numeric() was redundant)
Count_Before <- nrow(Cereals)
# NA Omit Cereals: drop every row containing at least one missing value
Cereals <- na.omit(Cereals)
Count_After <- nrow(Cereals)
Count_NA <- Count_Before - Count_After
print(paste("We removed", Count_NA, "rows containing NA value"))
# Output: [1] "We removed 3 rows containing NA value"
# (Fix: the knitr console output was fused onto the print() line by the Rmd
# extraction, which made the line invalid R; it is now a proper comment.)
# Processing Dataset for Hclust: center and scale the numeric columns
# (caret::preProcess only transforms numeric columns)
Scaling <- preProcess(Cereals, method = c("center", "scale"))
Cereals_Preprocess <- predict(Scaling, Cereals)
# Computing Distance - Euclidean
# NOTE(review): dist() coerces its input with as.matrix(); the character
# columns (name, mfr, type) would make the whole matrix character and yield
# NA distances -- they should probably be dropped first. TODO confirm against
# the original analysis.
Distance_Euclidian <- dist(Cereals_Preprocess, method = "euclidean")
# Computing Hierarchical Cluster - Single Linkage
hc1 <- hclust(Distance_Euclidian, method = "single")
plot(hc1, hang = -1, ann = TRUE)
# Computing Hierarchical Cluster - Complete Linkage
# (Fix: this header comment was fused onto the preceding plot() line by the
# Rmd extraction; same for the centroid-linkage header below.)
hc2 <- hclust(Distance_Euclidian, method = "complete")
plot(hc2, hang = -1, ann = TRUE)
# Computing Hierarchical Cluster - Centroid Linkage
hc3 <- hclust(Distance_Euclidian, method = "centroid")
plot(hc3, hang = -1, ann = TRUE)